In this notebook I will import gene counts file that were generated by Bowtie –> FeatureCounts
getwd()
## [1] "/Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks"
source("../references/biostats.R")
list.of.packages <- c("DESeq2", "RCurl", "tidyverse", "vegan", "pheatmap", "pastecs", "factoextra", "FactoMineR", "RColorBrewer", "tibble", "reshape2", "plotly", "corrplot", "PerformanceAnalytics") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
do.call("require", list(X))
})
sample.info <- read.csv("../raw-data/quantseq2020_key.csv", header=T, na.strings = NA) %>%
mutate_at(vars(lane, temp.parent), as.factor) %>%
mutate_at(vars(RNA.conc, endpoint.RFU, bioan.mean.bp), as.numeric) %>%
mutate(sample_stage=paste(sample, stage, sep="_"))
filenames <- data.frame(read.table("../qc-processing/featureCounts/filenames.txt", header = F, stringsAsFactors = F, fill = FALSE)) %>%
dplyr::rename(filename = 1) %>%
mutate(sample = as.character(gsub("_.*","", filename))) %>%
left_join(sample.info[c("sample", "sample_stage")])
## Joining, by = "sample"
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# Fill in NA value on the "Undetermined" row
filenames[filenames$filename == "Undetermined","sample_stage"] <- "Undetermined"
key <- full_join(sample.info, filenames)
## Joining, by = c("sample", "sample_stage")
## Warning: Column `sample` joining factor and character vector, coercing into
## character vector
save(key, file="../raw-data/key")
counts <- data.frame(read.table("../qc-processing/featureCounts/featurecounts-gene2kbdown.Rmatrix.txt", header = T, stringsAsFactors = F, fill = FALSE)) %>%
column_to_rownames(var="Geneid") %>%
rename_all(~as.character(filenames$sample_stage))
save(counts, file = "../results/gene-counts") #save R object with all gene counts, before filtering, removing samples, etc.
print(paste("Number of samples:", ncol(counts[,-ncol(counts)]), sep=" "))
## [1] "Number of samples: 146"
print(paste("Total number of genes in dataframe:", prettyNum(nrow(counts[,-ncol(counts)]), big.mark = ","), sep=" "))
## [1] "Total number of genes in dataframe: 32,210"
print(paste("Total counts, all samples:", prettyNum(sum(colSums(counts[,-ncol(counts)])), big.mark = ","), sep=" "))
## [1] "Total counts, all samples: 411,542,439"
#print(paste("Counts for", colnames(counts), ":", prettyNum(colSums(counts), big.mark = ","), sep=" "))
#inspect total counts by sample
#ggplotly(
ggplot(data.frame(colSums(counts[,-ncol(counts)])) %>% dplyr::rename(count.total = 1) %>% rownames_to_column(var="sample")) + geom_bar(aes(x=sample, y=count.total), stat = "identity") + ggtitle("Total count by sample") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) #)
multiqc.stats <- read.delim(file="../qc-processing/fastqc/trimmed/multiqc_report_trimmed_data/multiqc_general_stats.txt", sep = "\t")
colnames(multiqc.stats) <- gsub("FastQC_mqc.generalstats.fastqc.", "", colnames(multiqc.stats))
multiqc.stats <- multiqc.stats %>%
mutate(total_unique = total_sequences*(100-percent_duplicates)/100) %>%
mutate(Sample = gsub(".trim", "", Sample)) %>%
left_join(key, by=c("Sample"="sample"))
No. of PCR cycles strongly correlates positively % duplicates & % fails and negatively with total unique reads, less strongly with GC content.
cor(multiqc.stats %>%drop_na(population) %>% dplyr::select_if(is.numeric)) %>% corrplot(tl.cex=.45)
## Look closer at pcr cycles and read stummary stats
chart.Correlation(multiqc.stats %>%drop_na(population) %>%
select(pcr.cycles, percent_duplicates,
percent_gc, percent_fails, total_unique),
histogram=F, pch=19)
It looks like the % dupliates increases with PCR cycles, so I presume that there are artifacts of PCR overamplification. Since i have SE reads w/o molecular identifiers, 100bp dont remove duplicates, but proceed with caution - 10.7717/peerj.3091
ggplotly(
ggplot(multiqc.stats %>%drop_na(population),
aes(x=pcr.cycles, y=percent_duplicates,
label=Sample, color=stage, shape=pCO2.parent)) +
geom_point(size=3))
#scale_color_manual(values=c("black", "red")) +
# How many of each tissue/population/treatment would I throw out?
multiqc.stats %>% filter(pcr.cycles>17) %>% group_by(stage, population, pCO2.parent) %>% count() %>% View()
## Warning: Factor `pCO2.parent` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `pCO2.parent` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)), stdout = TRUE):
## running command ''/usr/bin/otool' -L '/Library/Frameworks/R.framework/Resources/
## modules/R_de.so'' had status 1
# How many would I retain?
multiqc.stats %>% filter(pcr.cycles<=17) %>% group_by(stage, population, pCO2.parent) %>% count() %>% View()
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)), stdout = TRUE):
## running command ''/usr/bin/otool' -L '/Library/Frameworks/R.framework/Resources/
## modules/R_de.so'' had status 1
# this looks fine only big discrepancy is in Fidalgo Bay larvae
cor(multiqc.stats %>%drop_na(population) %>% filter(pcr.cycles<=17) %>% dplyr::select_if(is.numeric)) %>% corrplot(tl.cex=.45)
chart.Correlation(multiqc.stats %>%drop_na(population) %>% filter(pcr.cycles<=17) %>%
select(pcr.cycles, percent_duplicates,
percent_gc, percent_fails, total_unique),
histogram=F, pch=19)
# much better.
Samples 35, 412, and 44 have very low counts, similar to sample 571 which was the no-tissue control, prepared alongside larval samples.
remove.list <- c("Undetermined", multiqc.stats %>% filter(pcr.cycles>17) %>% select(sample_stage) %>% unlist() %>% as.vector())
counts.filtered <- counts[ , -which(names(counts) %in% remove.list)]
ncol(counts.filtered)
## [1] 129
key <- key[ -which(key$sample_stage %in% remove.list), ]
nrow(key)
## [1] 129
nrow(key) == ncol(counts.filtered) #should = TRUE.
## [1] TRUE
ggplotly(ggplot(data = data.frame(colSums(counts.filtered)) %>%
dplyr::rename(count.total = 1) %>%
rownames_to_column(var="sample")) +
geom_bar(aes(x=sample, y=count.total), stat = "identity") + ggtitle("Total count by sample") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()))
## OPTIONAL: Remove genes that were in the No-Tissue Control, look at counts again
# remove.genes <- counts %>% select("571_larvae") %>% rownames_to_column("gene") %>% filter(`571_larvae` > 100) %>% select(gene) %>% unlist() %>% as.vector()
# counts.filtered <- counts #[-which(rownames(counts) %in% remove.genes),]
counts.t <- t(counts.filtered) #transform data to have each sample a row, each column a gene
save(counts.filtered, file = "../results/gene-counts-filtered")
save(counts.t, file = "../results/gene-counts-trans")
# Read in O. lurida gene file that connects OLUR gene ID to uniprot accession number
Olurida_gene_uniprot <- read_delim(file = here::here("references", "Olurida_v081-20190709-UniprotID.gff"), skip = 1, delim = "\t", col_names = c("contig", "source", "feature", "start", "end", "unknown1", "strand", "unknown2", "notes")) %>%
# Split giant gene "Notes" column into separate columns
mutate(ID=str_extract(notes, "ID=(.*?);"),
Parent=str_extract(notes, "Parent=(.*?);"),
Name=str_extract(notes, "Name=(.*?);"),
Alias=str_extract(notes, "Alias=(.*?);"),
AED=str_extract(notes, "AED=(.*?);"),
eAED=str_extract(notes, "eAED=(.*?);"),
Note=str_extract(notes, "Note=(.*?);"),
Ontology_term=str_extract(notes, "Ontology_term=(.*?);"),
Dbxref=str_extract(notes, "Dbxref=(.*?);"),
SPID=str_extract(notes, "SPID=(.*?);")
) %>%
#remove extraneous info from Olur gene ID and Uniprot species ID ("SPID")
mutate(Name=str_remove(Name, "Name=")) %>% mutate(Name=str_remove(Name, ";")) %>%
mutate(SPID=str_remove(SPID, "SPID=")) %>% mutate(SPID=str_remove(SPID, ";"))
## Warning in readLines(f, n): line 1 appears to contain an embedded nul
## Warning in readLines(f, n): incomplete final line found on '/Volumes/Bumblebee/
## O.lurida_QuantSeq-2020/._QuantSeq-04-21-2020.Rproj'
## Parsed with column specification:
## cols(
## contig = col_character(),
## source = col_character(),
## feature = col_character(),
## start = col_double(),
## end = col_double(),
## unknown1 = col_character(),
## strand = col_character(),
## unknown2 = col_character(),
## notes = col_character()
## )
save(Olurida_gene_uniprot, file = "../references/Olurida_gene_uniprot")